This document demonstrates the use of the bRF and LASSO-D3S functions for integrative GRN inference.
Those functions infer the regulatory pathways of Arabidopsis thaliana’s roots in response to nitrate (N) induction from Varala et al., 2018.
They use as inputs the expression profiles of N-responsive genes and TFBS information. Prior TFBS information was built by searching in the promoters of the N-responsive genes the PWM of the N-responsive regulators.
Import of the expression data and the N-responsive genes and regulators :
load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426 45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426 201
ALPHAS <- seq(0,1, by = 0.1)
subset <- sample(genes, replace = F, size = 20)
#subset <- genes
load("results/brf_perumtations_mse_all_genes.rdata")
load("results/100_permutations_bRF_importances.rdata")
draw_gene <- function(gene){
lmses[gene,] %>%
gather() %>%
separate(key, into = c("alpha", "rep", "MSEtype"), sep = " ") %>%
ggplot(aes(x=alpha, y=value, group =interaction(rep, MSEtype),
color = MSEtype))+ggtitle(gene)+ylab("MSE/Var(gene)")+
xlab("alpha")+
geom_line()+ggtitle(gene)+theme_pubr(legend = "top")
}
draw_gene_mean_sd <- function(gene, title = NULL){
lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("alpha", "rep", "MSEtype"),
sep = " ") %>%
group_by(alpha, MSEtype) %>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T)) %>%
ggplot(aes(
x = as.numeric(alpha),
y = value,
color = MSEtype,
fill = MSEtype
)) +ggtitle(paste(gene, title))+ylab("MSE/Var(gene)")+
geom_ribbon(aes(ymin = mean_mse - sd_mse ,
ymax = mean_mse + sd_mse ),
alpha = .4) +theme_pubr(legend = "top")+
geom_point(alpha = 0.1) + geom_smooth(se=F)+xlab("alpha")
}
draw_gene_effective_integration <- function(gene){
tfs_with_motif <- names(which(pwm_occurrence[gene,]==1))
data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])})) %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance),
sd_imp = sd(summed_importance),
alpha = as.numeric(alpha))%>%
ggplot(aes(x=alpha, y = summed_importance, color = dataset, fill = dataset)) +
geom_point(alpha = 0.1) + geom_smooth(se=F)+
geom_ribbon(aes(ymin = mean_imp - sd_imp ,
ymax = mean_imp + sd_imp ),
alpha = .4) +theme_pubr(legend = "top") +
ylab("effective data integration")
}
draw_MSE_vs_gene_effective_integration <- function(gene, return_cluster = F){
tfs_with_motif <- names(which(pwm_occurrence[gene,]==1))
if(return_cluster & length(tfs_with_motif) == 0)
return("not saved")
if(length(tfs_with_motif)==0)
return((ggplot()+ ggtitle ("no motif in this gene")))
inte <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])})) %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance),
sd_imp = sd(summed_importance),
alpha = as.numeric(alpha))
curves <- lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("alpha", "rep", "MSEtype"),
sep = " ") %>%
group_by(alpha, MSEtype) %>%
rename(dataset = MSEtype)%>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T),
alpha = as.numeric(alpha),
dataset = str_replace(dataset, "true_data", "trueData")) %>%
full_join(inte, by = c("alpha", "dataset", "rep")) %>%
group_by(dataset) %>%
dplyr::arrange(summed_importance) %>%
mutate(lower = loess.sd(x=summed_importance, y=value)$lower,
upper = loess.sd(x=summed_importance, y=value)$upper) %>%
mutate(lower = frollmean(lower, 1, fill = NA),
upper = frollmean(upper, 1, fill = NA))
true <- curves %>%
filter(dataset == "trueData")
shuff <- curves %>%
filter(dataset != "trueData")
true[paste0("shuffled_", c("mean_mse", "lower", "upper"))] <- shuff[c("mean_mse", "lower", "upper")]
dev <- true%>%
mutate(deviation = ifelse(shuffled_lower - upper < 0 , 0, shuffled_lower - upper))
cluster = "not saved"
if(sum(dev$deviation > 0, na.rm = T) > 10) cluster = "class of interest"
if(return_cluster) return(cluster)
curves %>%
ggplot(aes(y=value, x = summed_importance, color = dataset, fill = dataset))+
geom_ribbon(aes(ymin =lower ,
ymax = upper ),
alpha = .4)+ geom_point(alpha = 0.1, size = 0.5) +
theme_pubr(legend = "top") + geom_smooth(method = "loess", se = F)+
ylab("MSE/Var(gene)") + xlab("effective integration") + ggtitle(paste(gene, cluster))
}
# gene <-"AT1G08090"
# clusters_rf_eff <- sapply(genes, draw_MSE_vs_gene_effective_integration, return_cluster=T)
# table(clusters_rf_eff)
# # # save(clusters_rf_eff, file = "results/clusters_mse_eff_bRF_100permutations.rdata")
# #
# load("results/clusters_mse_bRF_100permutations.rdata")
# pos_genes <- names(which(clusters_rf==2))
# load("results/clusters_mse_eff_bRF_100permutations.rdata")
# pos_genes_eff <- names(which(clusters_rf_eff=="class of interest"))
#
# length(intersect(pos_genes, pos_genes_eff))
#
#
# dev%>%
# ggplot(aes(x=summed_importance, y=deviation)) + geom_point()+
# draw_MSE_vs_gene_effective_integration(gene)
for(gene in sample(genes, replace = F, size = 200)){
print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+draw_MSE_vs_gene_effective_integration(gene))
}
#
#genes saved with new method
# for(gene in setdiff(pos_genes_eff, pos_genes)){
# print(draw_MSE_vs_gene_effective_integration(gene))
# }
# # genes lost with new method
# for(gene in sample(setdiff(pos_genes, pos_genes_eff), size = 10, replace = F)){
# print(draw_MSE_vs_gene_effective_integration(gene))
# }
#
# gene <- "AT5G01480"
load("results/clusters_mse_bRF_100permutations.rdata")
pos_genes <- names(which(clusters_rf==2))
for(gene in sample(pos_genes,10, replace = F)){
print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+draw_MSE_vs_gene_effective_integration(gene))
}
load("results/lasso_perumtations_mse_all_genes.rdata")
load("results/100_permutations_lasso_importances_freq.rdata")
for(gene in subset){
print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+draw_MSE_vs_gene_effective_integration(gene))
}
load("results/clusters_mse_lasso_100permutations.rdata")
pos_genes <- names(which(clusters_lasso==1))
for(gene in sample(pos_genes,20, replace = F)){
print(draw_gene_effective_integration(gene)+draw_gene_mean_sd(gene)+draw_MSE_vs_gene_effective_integration(gene))
}